scaled_physeq.RData, which includes a
rooted tree that we created in
analysis/06_Ordination/scaled_physeq.RData.## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2535 taxa and 88 samples ]
## sample_data() Sample Data: [ 88 samples by 43 sample variables ]
## tax_table() Taxonomy Table: [ 2535 taxa by 9 taxonomic ranks ]
## phy_tree() Phylogenetic Tree: [ 2535 tips and 2534 internal nodes ]
## [1] 1942
## [1] 1942 1990
Calculate the relative abundance of the phyla across all the samples.
NOTE #1: The data MUST be normalized for this comparison. Here, we’ve scaled the read counts to 1,942.
NOTE #2:
# Create a phylum level dataframe
phylum_df <-
scaled_physeq %>%
# Agglomerate all ASV counts within a phylum
tax_glom(taxrank = "Phylum") %>%
# Calculate the relative abundance!
transform_sample_counts(function(x) {x/sum(x)}) %>%
# Create a dataframe from phyloseq object
psmelt() %>%
# Filter out Phyla < 1 %
#dplyr::filter(Abundance > 0.01) %>%
# fix the order of date
mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
# Fix the station order
station = fct_relevel(station, c("Copano West", "Copano East",
"Mesquite Bay", "Aransas Bay",
"Shipping Channel")))
# Make a list of phyla the top phyla
top10_phyla <-
phylum_df %>%
group_by(Phylum) %>%
summarize(mean_Abundance = mean(Abundance)) %>%
arrange(-mean_Abundance) %>%
head(n = 10) %>%
pull(Phylum)I’m personally against stacked bar plots because I think there are much more useful plots out there. However, most people want to plot them, show below is some code to plot it with your data.
# Stacked Bar Plot With All phyla
# Plot Phylum Abundances - make sure to load phylum_colors
phylum_df %>%
dplyr::filter(Phylum %in% top10_phyla) %>%
# Warning: It's important to have one sample per x value,
# otherwise, it will take the sum between multiple samples
dplyr::filter(depth == 0.0) %>%
dplyr::filter(fraction == "Whole") %>%
ggplot(aes(x = date, y = Abundance, fill = Phylum)) +
facet_grid(.~station) +
geom_bar(stat = "identity", color = "black") +
labs(title = "Top 10 Phyla: Surface Samples") +
scale_fill_manual(values = phylum_colors) +
theme_bw() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))A plot that’s still not great (in my opinion) but better than stacked bar plots is a faceted bar plot. Below, we can explore what these plots look like with the class data:
phylum_df %>%
dplyr::filter(Phylum %in% top10_phyla) %>%
# Individual sample comparison of surface waters, whole fraction
# This will allow us to look at individual samples!
# Note that whenever plotting stacked bar plots, you should always look at Individual samples!
dplyr::filter(depth == 0.0) %>%
dplyr::filter(fraction == "Whole") %>%
ggplot(aes(x = station, y = Abundance, fill = Phylum)) +
facet_grid(Phylum~date, scale = "free") +
# add the stacked bar
geom_bar(stat = "identity", color = "black") +
# change the colors to be our selected colors
scale_fill_manual(values = phylum_colors) +
theme_bw() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))### Or combined together:
phylum_df %>%
dplyr::filter(Phylum %in% top10_phyla) %>%
ggplot(aes(x = station, y = Abundance, fill = Phylum, color = Phylum)) +
facet_grid(Phylum~date, scale = "free") +
# add the stacked bar
geom_jitter() +
geom_boxplot(outlier.shape = NA, alpha = 0.5) +
# change the colors to be our selected colors
scale_fill_manual(values = phylum_colors) +
scale_color_manual(values = phylum_colors) +
theme_bw() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))Let’s collapse all the data into a single plot
### Or combined together:
phylum_df %>%
dplyr::filter(Phylum %in% top10_phyla) %>%
ggplot(aes(x = station, y = Abundance, fill = Phylum, color = Phylum)) +
facet_wrap(Phylum~., scales = "free", nrow = 2) +
# add the stacked bar
geom_jitter() +
geom_boxplot(outlier.shape = NA, alpha = 0.5) +
# change the colors to be our selected colors
scale_fill_manual(values = phylum_colors) +
scale_color_manual(values = phylum_colors) +
theme_bw() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))Clearly there are some key differences in the composition of the communities above!
Phyla that “clearly” (visual eye) vary by station:
# Narrow in on a specific group
# Actinomycetota - y: abundance, x: station, dot plot + boxplot
phylum_df %>%
dplyr::filter(Phylum == "Actinomycetota") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Actinobacteriota Phylum Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "right")# Statistically: Kruskall-Wallis followed by a Tukey's Posthoc test
# These are non-parametric (non-normal) stat tests
# CONTINUOUS
phylum_df %>%
dplyr::filter(Phylum == "Actinomycetota") %>%
# PLOT IT FIRST BEFORE ADDING THIS LINE OF CODE
#dplyr::filter(salinity_psu < 25) %>%
# build the plot
ggplot(aes(x = salinity_psu, y = Abundance)) +
geom_point(aes(color = station)) +
theme_bw() +
geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
labs(title = "Actinobacteriota Phylum Abundance") +
scale_color_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "right")# Narrow in on a specific group
# Actinomycetota - y: abundance, x: station, dot plot + boxplot
phylum_df %>%
dplyr::filter(Phylum == "Cyanobacteriota") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Cyanobacteriota Phylum Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "right")# Statistically: Kruskall-Wallis followed by a Tukey's Posthoc test
# These are non-parametric (non-normal) stat tests
# CONTINUOUS
phylum_df %>%
dplyr::filter(Phylum == "Cyanobacteriota") %>%
# build the plot
ggplot(aes(x = salinity_psu, y = Abundance)) +
geom_point(aes(color = station, shape = date)) +
theme_bw() +
geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
labs(title = "Cyanobacteriota Phylum Abundance") +
scale_color_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "right")# Calculate the Family relative abundance
# Note: The read depth MUST be normalized in some way: scale_reads
family_df <-
scaled_physeq %>%
# agglomerate at the phylum level
tax_glom(taxrank = "Family") %>%
# Transform counts to relative abundance
transform_sample_counts(function (x) {x/sum(x)}) %>%
# Melt to a long format
psmelt() %>%
# Filter out Phyla that are <1% - get rid of low abundant Phyla
dplyr::filter(Abundance > 0.01) %>%
# fix the order of date
mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
station = fct_relevel(station, c("Copano West", "Copano East",
"Mesquite Bay", "Aransas Bay",
"Shipping Channel")))
# Check family_df
#str(family_df)
# Plot family
# Actinobacteriota Families
family_df %>%
dplyr::filter(Phylum == "Actinomycetota") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Family, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Actinomycetota Family Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")# Plot family
family_df %>%
dplyr::filter(Phylum == "Cyanobacteriota") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Family, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Cyanobacteriota Family Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")# Calculate the Family relative abundance
# Note: The read depth MUST be normalized in some way: scale_reads
genus_df <-
scaled_physeq %>%
# agglomerate at the phylum level
tax_glom(taxrank = "Genus") %>%
# Transform counts to relative abundance
transform_sample_counts(function (x) {x/sum(x)}) %>%
# Melt to a long format
psmelt() %>%
# Filter out Phyla that are <1% - get rid of low abundant Phyla
dplyr::filter(Abundance > 0.01) %>%
# fix the order of date
mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
station = fct_relevel(station, c("Copano West", "Copano East",
"Mesquite Bay", "Aransas Bay",
"Shipping Channel")))
# Actinobacteriota
# Plot genus
genus_df %>%
dplyr::filter(Phylum == "Actinomycetota") %>%
#dplyr::filter(Family == "Sporichthyaceae") %>%
#dplyr::filter(Genus == "hgcI clade") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Genus, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Actinomycetota hgc1 Genus Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")# Cyanobacteriota
# Plot genus
genus_df %>%
dplyr::filter(Phylum == "Cyanobacteriota") %>%
dplyr::filter(Genus %in% c("Cyanobium PCC-6307", "Synechococcus CC9902")) %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~Genus, scales = "free_y", nrow = 1) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Cyanobacteriota Genus Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")Cyanobium PCC-6307
Synechococcus CC9902:
# Calculate the Family relative abundance
# Note: The read depth MUST be normalized in some way: scale_reads
ASV_df <-
scaled_physeq %>%
# Prune out ASVs that have fewer than 100 counts!
## LOOK AT HOW MANY ARE REMOVED! We scaled to 1,962 reads!
prune_taxa(taxa_sums(.) >= 196, .) %>%
# agglomerate at the phylum level
tax_glom(taxrank = "ASV") %>%
# Transform counts to relative abundance
transform_sample_counts(function (x) {x/sum(x)}) %>%
# Melt to a long format
psmelt() %>%
# fix the order of date
mutate(date = fct_relevel(date, c("6/2/21", "6/15/21", "10/5/21")),
station = fct_relevel(station, c("Copano West", "Copano East",
"Mesquite Bay", "Aransas Bay",
"Shipping Channel")))
# Actinobacteriota
# Plot genus
ASV_df %>%
dplyr::filter(Phylum == "Actinomycetota") %>%
dplyr::filter(Genus == "hgcI clade") %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(.~ASV, scales = "free_y", nrow = 2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Actinomycetota hgc1 Genus Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")# Cyanobacteriota
# Plot genus
ASV_df %>%
dplyr::filter(Phylum == "Cyanobacteriota") %>%
dplyr::filter(Genus %in% c("Cyanobium PCC-6307", "Synechococcus CC9902")) %>%
# build the plot
ggplot(aes(x = station, y = Abundance,
fill = station, color = station)) +
facet_wrap(Genus~ASV, scales = "free_y", nrow = 3) +
geom_boxplot(alpha = 0.5, outlier.shape = NA) + # outliers not plotted here in boxplot
geom_jitter() +
theme_bw() +
labs(title = "Cyanobacteriota Genus Abundance") +
scale_color_manual(values = station_colors) +
scale_fill_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")ASV_df %>%
dplyr::filter(Phylum == "Cyanobacteriota") %>%
dplyr::filter(ASV %in% c("ASV_0001", "ASV_0002", "ASV_0025")) %>%
# build the plot
ggplot(aes(x = salinity_psu, y = Abundance)) +
facet_wrap(Genus~ASV, scales = "free_y") +
geom_point(aes(color = station)) +
geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
theme_bw() +
labs(title = "Cyanobacteriota Genus Abundance") +
scale_color_manual(values = station_colors) +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1),
legend.position = "bottom")For reproducibility
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.3 (2023-03-15)
## os Rocky Linux 9.4 (Blue Onyx)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2025-04-22
## pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## ade4 1.7-23 2025-02-14 [1] CRAN (R 4.2.3)
## ape 5.8-1 2024-12-16 [1] CRAN (R 4.2.3)
## Biobase 2.58.0 2022-11-01 [1] Bioconductor
## BiocGenerics 0.44.0 2022-11-01 [1] Bioconductor
## biomformat 1.26.0 2022-11-01 [1] Bioconductor
## Biostrings 2.66.0 2022-11-01 [1] Bioconductor
## bitops 1.0-9 2024-10-03 [2] CRAN (R 4.2.3)
## bslib 0.9.0 2025-01-30 [1] CRAN (R 4.2.3)
## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.2.3)
## cli 3.6.4 2025-02-13 [1] CRAN (R 4.2.3)
## cluster 2.1.4 2022-08-22 [2] CRAN (R 4.2.3)
## codetools 0.2-19 2023-02-01 [2] CRAN (R 4.2.3)
## colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.2.3)
## crayon 1.5.3 2024-06-20 [1] CRAN (R 4.2.3)
## data.table 1.17.0 2025-02-22 [1] CRAN (R 4.2.3)
## devtools * 2.4.5 2022-10-11 [1] CRAN (R 4.2.3)
## digest 0.6.37 2024-08-19 [1] CRAN (R 4.2.3)
## dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.2.1)
## ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.2.3)
## evaluate 1.0.3 2025-01-10 [1] CRAN (R 4.2.3)
## farver 2.1.2 2024-05-13 [2] CRAN (R 4.2.3)
## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.2.3)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.2.1)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.2.1)
## fs 1.6.5 2024-10-30 [1] CRAN (R 4.2.3)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.1)
## GenomeInfoDb 1.34.9 2023-02-02 [1] Bioconductor
## GenomeInfoDbData 1.2.9 2023-03-07 [1] Bioconductor
## ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.2.1)
## glue 1.8.0 2024-09-30 [1] CRAN (R 4.2.3)
## gtable 0.3.6 2024-10-25 [1] CRAN (R 4.2.3)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.2.1)
## htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.2.3)
## htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.2.1)
## httpuv 1.6.15 2024-03-26 [1] CRAN (R 4.2.3)
## igraph 2.1.4 2025-01-23 [1] CRAN (R 4.2.3)
## IRanges 2.32.0 2022-11-01 [1] Bioconductor
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.2.1)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.2.3)
## jsonlite 1.9.1 2025-03-03 [1] CRAN (R 4.2.3)
## knitr 1.50 2025-03-16 [1] CRAN (R 4.2.3)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.2.1)
## later 1.4.1 2024-11-27 [1] CRAN (R 4.2.3)
## lattice 0.20-45 2021-09-22 [2] CRAN (R 4.2.3)
## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.2.1)
## lubridate * 1.9.4 2024-12-08 [1] CRAN (R 4.2.3)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.2.3)
## MASS 7.3-58.2 2023-01-23 [2] CRAN (R 4.2.3)
## Matrix 1.6-5 2024-01-11 [1] CRAN (R 4.2.1)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.2.3)
## mgcv 1.8-42 2023-03-02 [2] CRAN (R 4.2.3)
## mime 0.13 2025-03-17 [1] CRAN (R 4.2.3)
## miniUI 0.1.1.1 2018-05-18 [2] CRAN (R 4.2.3)
## multtest 2.54.0 2022-11-01 [1] Bioconductor
## munsell 0.5.1 2024-04-01 [2] CRAN (R 4.2.3)
## nlme 3.1-162 2023-01-31 [2] CRAN (R 4.2.3)
## pacman 0.5.1 2019-03-11 [1] CRAN (R 4.2.3)
## patchwork * 1.3.0.9000 2025-02-19 [1] Github (thomasp85/patchwork@2695a9f)
## permute 0.9-7 2022-01-27 [1] CRAN (R 4.2.1)
## phyloseq * 1.42.0 2022-11-01 [1] Bioconductor
## pillar 1.10.1 2025-01-07 [1] CRAN (R 4.2.3)
## pkgbuild 1.4.5 2024-10-28 [2] CRAN (R 4.2.3)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.2.3)
## pkgload 1.4.0 2024-06-28 [1] CRAN (R 4.2.3)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.2.1)
## profvis 0.4.0 2024-09-20 [2] CRAN (R 4.2.3)
## promises 1.3.2 2024-11-28 [1] CRAN (R 4.2.3)
## purrr * 1.0.4 2025-02-05 [1] CRAN (R 4.2.3)
## R6 2.6.1 2025-02-15 [1] CRAN (R 4.2.3)
## Rcpp 1.0.14 2025-01-12 [1] CRAN (R 4.2.3)
## RCurl 1.98-1.17 2025-03-22 [1] CRAN (R 4.2.3)
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.2.1)
## remotes 2.5.0 2024-03-17 [1] CRAN (R 4.2.3)
## reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.2.1)
## rhdf5 2.42.1 2023-04-07 [1] Bioconductor
## rhdf5filters 1.10.1 2023-03-24 [1] Bioconductor
## Rhdf5lib 1.20.0 2022-11-01 [1] Bioconductor
## rlang 1.1.5 2025-01-17 [1] CRAN (R 4.2.3)
## rmarkdown 2.29 2024-11-04 [1] CRAN (R 4.2.3)
## rstudioapi 0.17.1 2024-10-22 [2] CRAN (R 4.2.3)
## S4Vectors 0.36.2 2023-02-26 [1] Bioconductor
## sass 0.4.9 2024-03-15 [1] CRAN (R 4.2.3)
## scales 1.3.0 2023-11-28 [1] CRAN (R 4.2.1)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.2.3)
## shiny 1.10.0 2024-12-14 [1] CRAN (R 4.2.3)
## stringi 1.8.4 2024-05-06 [1] CRAN (R 4.2.3)
## stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.2.1)
## survival 3.5-3 2023-02-12 [2] CRAN (R 4.2.3)
## tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.2.1)
## tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.2.1)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.2.3)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.2.3)
## timechange 0.3.0 2024-01-18 [1] CRAN (R 4.2.1)
## tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.2.3)
## urlchecker 1.0.1 2021-11-30 [2] CRAN (R 4.2.3)
## usethis * 3.0.0 2024-07-29 [2] CRAN (R 4.2.3)
## vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.2.1)
## vegan 2.6-10 2025-01-29 [1] CRAN (R 4.2.3)
## withr 3.0.2 2024-10-28 [1] CRAN (R 4.2.3)
## xfun 0.51 2025-02-19 [1] CRAN (R 4.2.3)
## xtable 1.8-4 2019-04-21 [2] CRAN (R 4.2.3)
## XVector 0.38.0 2022-11-01 [1] Bioconductor
## yaml 2.3.10 2024-07-26 [1] CRAN (R 4.2.3)
## zlibbioc 1.44.0 2022-11-01 [1] Bioconductor
##
## [1] /home/mls528/R/x86_64-pc-linux-gnu-library/4.2
## [2] /programs/R-4.2.3/lib64/R/library
##
## ──────────────────────────────────────────────────────────────────────────────